Main
Read in and process data
# Setup
require(here)
source(here("mikk_genome", "code", "scripts", "setup.R"))
# Create function to read in data and bind into single DF
read_n_bind = function(data_path_pref, dataset){
# Set path
path = paste(data_path_pref, dataset, sep = "")
# Read in data
data_files <- list.files(path,
full.names = T)
data_files_trunc <- list.files(path)
data_files_trunc <- gsub(".txt", "", data_files_trunc)
data_list <- lapply(data_files, function(data_file){
df <- read.delim(data_file,
sep = "\t",
header = T)
return(df)
})
names(data_list) <- as.integer(data_files_trunc)
# reorder
data_list <- data_list[order(as.integer(names(data_list)))]
# bind into DF
out_df = dplyr::bind_rows(data_list, .id = "chr")
out_df$chr <- factor(out_df$chr, levels = seq(1, 24))
# get kb measure
out_df$bin_bdr_kb <- out_df$bin_bdr / 1000
return(out_df)
}
# Run over both datasets
datasets = c("mikk", "1kg")
final_lst = lapply(datasets, function(x) read_n_bind("ld/20200727_mean_r2_10kb-lim_", x))
names(final_lst) = datasets
# Combine into single DF
r2_final_df <- dplyr::bind_rows(final_lst, .id = "dataset")
# Write table to repo
write.table(r2_final_df,
file = here::here("mikk_genome", "data", "20200803_r2_10kb-lim.csv"),
quote = F, sep = ",", row.names = F, col.names = T)
Plot
# Tidy data for final plot
r2_final_df$chr = factor(r2_final_df$chr, levels = seq(1, 24))
r2_final_df$dataset = toupper(r2_final_df$dataset)
# Plot
r2_plot_main = r2_final_df %>% ggplot() +
geom_line(aes(bin_bdr_kb, mean, colour = chr)) +
theme_cowplot() +
xlab("Distance between SNPs (kb)") +
ylab(bquote(.("Mean r")^2)) +
facet_wrap(~dataset, nrow = 1, ncol = 2) +
theme(panel.grid = element_blank(),
strip.background = element_blank(),
legend.position = c(0.9, .8)) +
labs(colour = "Chromosome") +
scale_y_continuous(breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
limits = c(0.05, 0.6))
r2_plot_main

ggplotly(r2_plot_main)
the condition has length > 1 and only the first element will be usedthe condition has length > 1 and only the first element will be used
# Save plot to repo
ggsave(filename = paste("20200803_mean-r2_10kb-lim_1KGvMIKK_single", ".svg", sep = ""),
plot = r2_plot_main,
device = "svg",
path = here::here("plots", "ld_decay"),
width = 25,
height = 13,
units = "cm")
Inset
100-bp windows
# Read in data
r2_df_1kb_mikk = read_n_bind("ld/20200803_mean_r2_1kb-lim_", "mikk")
# Write table to repo
write.table(r2_df_1kb_mikk,
file = here::here("mikk_genome", "data", "20200803_r2_1kb-lim_mikk.csv"),
quote = F, sep = ",", row.names = F, col.names = T)
# Process for plotting
r2_df_1kb_mikk$chr <- factor(r2_df_1kb_mikk$chr, levels = seq(1, 24))
# Plot
r2_1kb_mikk = r2_df_1kb_mikk %>% ggplot() +
geom_line(aes(bin_bdr, mean, colour = chr)) +
theme_bw() +
xlab("Distance beetween SNPs (bp)") +
ylab(bquote(.("Mean r")^2)) +
labs(colour = "Chromosome") +
theme(panel.grid = element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14)) +
guides(colour = F) +
scale_x_continuous(limits = c(0, 1000)) +
scale_y_continuous(breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6),
limits = c(0.05, 0.6))
r2_1kb_mikk

# Save to repo
ggsave(filename = paste("20200803_mean-r2_1kb-lim_MIKK_inset_100bp-bins", ".png", sep = ""),
plot = r2_1kb_mikk,
device = "png",
path = here::here("mikk_genome", "plots"),
width = 10.88,
height = 8,
units = "cm",
dpi = 500)
10-bp windows
For a finer resolution.
Get means for each bin
script=mikk_genome/code/scripts/20200724_r2_decay_mean_1gk_1kb-lim.R
out_dir=ld/20200727_mean_r2_1kb-lim_mikk
for in_file in $(find ld/20200727_mikk_maf-0.10_window-50kb_no-missing/*ld); do
name=$(basename $in_file | cut -f1 -d".");
bsub \
-M 30000 \
-o log/20200803_$name\_mean-r2_1kb-max.out \
-e log/20200803_$name\_mean-r2_1kb-max.err \
"Rscript \
--vanilla \
$script \
$in_file \
$out_dir";
done
# Combine in R
data_files <- list.files("ld/20200727_mean_r2_1kb-lim_mikk",
full.names = T)
data_files_trunc <- list.files("ld/20200727_mean_r2_1kb-lim_mikk")
data_files_trunc <- gsub(".txt", "", data_files_trunc)
data_list <- lapply(data_files, function(data_file){
df <- read.delim(data_file,
sep = "\t",
header = T)
return(df)
})
names(data_list) <- as.integer(data_files_trunc)
# reorder
data_list <- data_list[order(as.integer(names(data_list)))]
# bind into DF
r2_df_1kb_mikk <- dplyr::bind_rows(data_list, .id = "chr")
r2_df_1kb_mikk$chr <- factor(r2_df_1kb_mikk$chr, levels = seq(1, 24))
# write to table
write.table(r2_df_1kb_mikk, here::here("mikk_genome", "data", "20200803_mikk_ld-decay_1kb-lim_10bp-windows.txt"),
quote = F, row.names = F, col.names = T, sep = "\t")
Plot
# Read in data
r2_df_1kb_mikk = read.table(here::here("data", "20200803_mikk_ld-decay_1kb-lim_10bp-windows.txt"),
header = T, sep = "\t", as.is = T)
# Factorise chromosomes
r2_df_1kb_mikk$chr <- factor(r2_df_1kb_mikk$chr, levels = seq(1, 24))
# Plot
r2_df_1kb_mikk %>% ggplot() +
geom_line(aes(bin_bdr, mean, colour = chr)) +
theme_bw() +
xlab("Distance beetween SNPs (bp)") +
ylab(bquote(.("Mean r")^2)) +
labs(colour = "Chromosome") +
theme(panel.grid = element_blank(),
axis.text = element_text(size = 12),
axis.title = element_text(size = 16)) +
guides(colour = F) +
scale_y_continuous(breaks = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7),
limits = c(0.05, 0.7))

# Save
ggsave(filename = paste("20200803_mean-r2_1kb-lim_MIKK_inset_10bp-windows", ".png", sep = ""),
device = "png",
path = here::here("mikk_genome", "plots"),
width = 10.88,
height = 8,
units = "cm",
dpi = 500)